# Install and import packages
if(!require("ggplot2")) install.packages("ggplot2")
if(!require("tidyverse")) install.packages("tidyverse")
if(!require("dplyr")) install.packages("dplyr")
if(!require("glue")) install.packages("glue")
if(!require("scales")) install.packages("scales")
if(!require("psych")) install.packages("psych")
if(!require("plotly")) install.packages("plotly")
if(!require("readr")) install.packages("readr")
library(ggplot2)
library(tidyverse)
library(dplyr)
library(glue)
library(scales)
library(psych)
library(plotly)
library(readr)Mini Project 2
Introduction
For STA 9750 Mini Project 2, I’m going to propose remaking a classic movie: Independence Day.
Task 7: Pitch
Originally released in 1996 and starring Will Smith, this film depicts Earth’s fight for freedom from an alien invasion.
Starring Jacob Batalon in his breakout role as Captain Steven Hiller and famous robo-freedom-fighter John DiMaggio as David Levinson, this film follows an ensemble of characters as they look to save Earth from the new threat from the aliens of Omicron Persei 10.
Directed by Glass Onion and Knives Out director Rian Johnson, the action behind this film belies the mystery of the Omicronians and their true plot.
How we got here
Now that I’ve thoroughly grabbed your attention, how the heck did we come up with this cast of characters?
After sorting through IMDb data, these three just truly spoke to us and we knew we were on to something.
Task 0: In which the data is gathered
Since we’re doing this in R, let’s get some code going
Next let’s import our data
get_imdb_file <- function(fname){
BASE_URL <- "https://datasets.imdbws.com/"
fname_ext <- paste0(fname, ".tsv.gz")
if(!file.exists(fname_ext)){
FILE_URL <- paste0(BASE_URL, fname_ext)
download.file(FILE_URL,
destfile = fname_ext)
}
as.data.frame(readr::read_tsv(fname_ext, lazy=FALSE))
}
NAME_BASICS <- get_imdb_file("name.basics")
TITLE_BASICS <- get_imdb_file("title.basics")
TITLE_EPISODES <- get_imdb_file("title.episode")
TITLE_RATINGS <- get_imdb_file("title.ratings")
TITLE_CREW <- get_imdb_file("title.crew")
TITLE_PRINCIPALS <- get_imdb_file("title.principals")
NAME_BASICS <- NAME_BASICS |>
filter(str_count(knownForTitles, ",") > 1)Looking at the data we just pulled
TITLE_RATINGS |>
ggplot(aes(x=numVotes)) +
geom_histogram(bins=30) +
xlab("Number of IMDB Ratings") +
ylab("Number of Titles") +
ggtitle("Majority of IMDB Titles Have Less than 100 Ratings") +
theme_bw() +
scale_x_log10(label=scales::comma) +
scale_y_continuous(label=scales::comma)TITLE_RATINGS |>
pull(numVotes) |>
quantile() 0% 25% 50% 75% 100%
5 11 26 101 2952383
TITLE_RATINGS <- TITLE_RATINGS |>
filter(numVotes >= 100)
TITLE_BASICS <- TITLE_BASICS |>
semi_join(TITLE_RATINGS,
join_by(tconst == tconst))
TITLE_CREW <- TITLE_CREW |>
semi_join(TITLE_RATINGS,
join_by(tconst == tconst))
TITLE_EPISODES_1 <- TITLE_EPISODES |>
semi_join(TITLE_RATINGS,
join_by(tconst == tconst))
TITLE_EPISODES_2 <- TITLE_EPISODES |>
semi_join(TITLE_RATINGS,
join_by(parentTconst == tconst))
TITLE_EPISODES <- bind_rows(TITLE_EPISODES_1,
TITLE_EPISODES_2) |>
distinct()
TITLE_PRINCIPALS <- TITLE_PRINCIPALS |>
semi_join(TITLE_RATINGS, join_by(tconst == tconst))
rm(TITLE_EPISODES_1)
rm(TITLE_EPISODES_2)
#EDA
NAME_BASICS <- NAME_BASICS |>
mutate(birthYear = as.numeric(birthYear),
deathYear = as.numeric(deathYear))We have several tasks to complete with this data, including some EDA!
Task 1: Clean Clean Clean
convert_columns <- function(data, columns, conversion = "numeric") {
# Check if conversion is valid
if(!conversion %in% c("numeric", "logical")) {
stop("Invalid conversion type. Choose either 'numeric' or 'logical'.")
}
# Apply the conversion based on argument
if (conversion == "numeric") {
data <- data |>
mutate(across(all_of(columns), as.numeric))
} else if (conversion == "logical") {
data <- data |>
mutate(across(all_of(columns), as.logical))
}
return(data)
}
TITLE_BASICS <- convert_columns(TITLE_BASICS, columns = c("startYear","endYear","runtimeMinutes"), conversion="numeric")
TITLE_EPISODES <- convert_columns(TITLE_EPISODES, columns=c("seasonNumber","episodeNumber"), conversion="numeric")Task 2: Provided Questions, Provided Answers
Here we seek to answer several questions from the dataset.
#2.1: How many movies are in our data set? How many TV series? How many TV episodes?
# TITLE_BASICS will have this answer
glimpse(TITLE_BASICS)Rows: 374,231
Columns: 9
$ tconst <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt…
$ titleType <chr> "short", "short", "short", "short", "short", "short", "…
$ primaryTitle <chr> "Carmencita", "Le clown et ses chiens", "Poor Pierrot",…
$ originalTitle <chr> "Carmencita", "Le clown et ses chiens", "Pauvre Pierrot…
$ isAdult <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ startYear <dbl> 1894, 1892, 1892, 1892, 1893, 1894, 1894, 1894, 1894, 1…
$ endYear <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ runtimeMinutes <dbl> 1, 5, 5, 12, 1, 1, 1, 1, 45, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
$ genres <chr> "Documentary,Short", "Animation,Short", "Animation,Come…
table(TITLE_BASICS$titleType)
movie short tvEpisode tvMiniSeries tvMovie tvSeries
132246 16735 156768 5932 15048 29992
tvShort tvSpecial video videoGame
411 3064 9345 4690
# movie, tvEpisode and tvSeries
MOVIE_COUNT <- TITLE_BASICS |>
filter(`titleType` == "movie") |>
select(`tconst`) |>
unique() |>
count()
sprintf("There are %s movies in the data",format(MOVIE_COUNT,big.mark=",", scientific=FALSE))[1] "There are 132,246 movies in the data"
TV_SERIES_COUNT <- TITLE_BASICS |>
filter(`titleType` == "tvSeries") |>
select(`tconst`) |>
unique() |>
count()
sprintf("There are %s TV Series in the data",format(TV_SERIES_COUNT,big.mark=",", scientific=FALSE))[1] "There are 29,992 TV Series in the data"
TV_EPISODE_COUNT <- TITLE_BASICS |>
filter(`titleType` == "tvEpisode") |>
select(`tconst`) |>
unique() |>
count()
sprintf("There are %s TV Episodes in the data",format(TV_EPISODE_COUNT,big.mark=",", scientific=FALSE))[1] "There are 156,768 TV Episodes in the data"
#2.2: Oldest living person in data
glimpse(NAME_BASICS)Rows: 3,189,865
Columns: 6
$ nconst <chr> "nm0000001", "nm0000002", "nm0000003", "nm0000004", …
$ primaryName <chr> "Fred Astaire", "Lauren Bacall", "Brigitte Bardot", …
$ birthYear <dbl> 1899, 1924, 1934, 1949, 1918, 1915, 1899, 1924, 1925…
$ deathYear <dbl> 1987, 2014, NA, 1982, 2007, 1982, 1957, 2004, 1984, …
$ primaryProfession <chr> "actor,miscellaneous,producer", "actress,soundtrack,…
$ knownForTitles <chr> "tt0072308,tt0050419,tt0053137,tt0027125", "tt003738…
OLDEST <- NAME_BASICS |>
filter(is.na(`deathYear`)) |>
arrange(`birthYear`) |>
select(`primaryName`,`birthYear`,`deathYear`) |>
slice_head(n=10)
OLDEST primaryName birthYear deathYear
1 Traudl Lessing 1625 NA
2 Robert De Visée 1655 NA
3 William Sandys 1767 NA
4 Richard Dybeck 1811 NA
5 Albert Monnier 1815 NA
6 C. Hostrup 1818 NA
7 Edouard Martin 1825 NA
8 Gustav Lange 1830 NA
9 William S. Hooser 1836 NA
10 Ion Ivanovici 1845 NA
# Clearly this isn't the way to go about this
glimpse(TITLE_BASICS)Rows: 374,231
Columns: 9
$ tconst <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt…
$ titleType <chr> "short", "short", "short", "short", "short", "short", "…
$ primaryTitle <chr> "Carmencita", "Le clown et ses chiens", "Poor Pierrot",…
$ originalTitle <chr> "Carmencita", "Le clown et ses chiens", "Pauvre Pierrot…
$ isAdult <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ startYear <dbl> 1894, 1892, 1892, 1892, 1893, 1894, 1894, 1894, 1894, 1…
$ endYear <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ runtimeMinutes <dbl> 1, 5, 5, 12, 1, 1, 1, 1, 45, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
$ genres <chr> "Documentary,Short", "Animation,Short", "Animation,Come…
glimpse(NAME_BASICS)Rows: 3,189,865
Columns: 6
$ nconst <chr> "nm0000001", "nm0000002", "nm0000003", "nm0000004", …
$ primaryName <chr> "Fred Astaire", "Lauren Bacall", "Brigitte Bardot", …
$ birthYear <dbl> 1899, 1924, 1934, 1949, 1918, 1915, 1899, 1924, 1925…
$ deathYear <dbl> 1987, 2014, NA, 1982, 2007, 1982, 1957, 2004, 1984, …
$ primaryProfession <chr> "actor,miscellaneous,producer", "actress,soundtrack,…
$ knownForTitles <chr> "tt0072308,tt0050419,tt0053137,tt0027125", "tt003738…
NAME_BASICS |> separate_longer_delim(knownForTitles, ",") |>
slice_head(n=5) nconst primaryName birthYear deathYear
1 nm0000001 Fred Astaire 1899 1987
2 nm0000001 Fred Astaire 1899 1987
3 nm0000001 Fred Astaire 1899 1987
4 nm0000001 Fred Astaire 1899 1987
5 nm0000002 Lauren Bacall 1924 2014
primaryProfession knownForTitles
1 actor,miscellaneous,producer tt0072308
2 actor,miscellaneous,producer tt0050419
3 actor,miscellaneous,producer tt0053137
4 actor,miscellaneous,producer tt0027125
5 actress,soundtrack,archive_footage tt0037382
TITLES_AND_NAMES <- NAME_BASICS |>
separate_longer_delim(knownForTitles, delim = ",") |>
inner_join(TITLE_BASICS, by = c("knownForTitles" = "tconst")) |>
filter(is.na(deathYear),birthYear >= 1908) |> #according to google, the current oldest person alive was born in 1908
arrange(birthYear) |>
select(primaryName, birthYear, deathYear) |>
unique() |>
slice_head(n = 5)
head(TITLES_AND_NAMES) # Because there are too many NULL deathYear values, this question can't really be answered primaryName birthYear deathYear
1 Robert Aisner 1908 NA
2 Ugo Amadoro 1908 NA
3 Hanna Bark 1908 NA
4 Harry Belock 1908 NA
5 Aida Broadbent 1908 NA
#2.3: There is one TV Episode in this data set with a perfect 10/10 rating and at least 200,00 IMDb ratings. What is it? Which series does it belong to?
glimpse(TITLE_RATINGS)Rows: 374,231
Columns: 3
$ tconst <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt0…
$ averageRating <dbl> 5.7, 5.6, 6.5, 5.4, 6.2, 5.0, 5.4, 5.4, 5.4, 6.8, 5.2, 7…
$ numVotes <dbl> 2096, 283, 2104, 183, 2839, 197, 889, 2243, 215, 7728, 4…
HIGHEST_RATED <- TITLE_RATINGS |>
filter(numVotes >= 200000) |>
slice_max(order_by = averageRating) |>
left_join(TITLE_EPISODES, by = c("tconst" = "tconst")) |>
inner_join(TITLE_BASICS, by = c("tconst" = "tconst")) |>
left_join(TITLE_BASICS, by = c("parentTconst" = "tconst")) |>
select(
filmID = `tconst`,
seriesName = `primaryTitle.y`,
episodeName = `primaryTitle.x`,
`seasonNumber`,
`episodeNumber`,
yearAired = `startYear.x`,
`averageRating`,
`numVotes`
)
head(HIGHEST_RATED) # Unsurprisingly, it's a season 5 episode of Breaking Bad, specifically, the one where Hank gets got filmID seriesName episodeName seasonNumber episodeNumber yearAired
1 tt2301451 Breaking Bad Ozymandias 5 14 2013
averageRating numVotes
1 10 229933
If anyone’s interested in a famous scene (and subsequent meme) from this episode (Warning: violence and spoilers)
#2.4: Which 4 projects is actor Mark Hamill most known for?
# Just guessing beforehand: Star War IV, V, VI, and VA for the Joker in the Batman animated series
MARK_HAMILL <- NAME_BASICS |>
filter(primaryName == "Mark Hamill")
MARK_HAMILL # going to guess the record with multiple titles is the correct Mark Hamill nconst primaryName birthYear deathYear primaryProfession
1 nm0000434 Mark Hamill NA NA actor,producer,director
knownForTitles
1 tt0076759,tt2527336,tt0080684,tt0086190
MARK_HAMILL <- NAME_BASICS |>
filter(nconst == "nm0000434") |>
separate_longer_delim(knownForTitles, delim = ",") |>
inner_join(TITLE_BASICS, by = c("knownForTitles" = "tconst")) |>
select(actorID = `nconst`,
`primaryName`,
`primaryTitle`)
print(MARK_HAMILL) actorID primaryName primaryTitle
1 nm0000434 Mark Hamill Star Wars: Episode IV - A New Hope
2 nm0000434 Mark Hamill Star Wars: Episode VIII - The Last Jedi
3 nm0000434 Mark Hamill Star Wars: Episode V - The Empire Strikes Back
4 nm0000434 Mark Hamill Star Wars: Episode VI - Return of the Jedi
Apparently, Star Wars Episode VIII gets a listing before his Joker VA. Hard disagree, but I digress.
#2.5: What TV series, with more than 12 episodes, has the highest average rating?
glimpse(TITLE_EPISODES)Rows: 3,023,489
Columns: 4
$ tconst <chr> "tt0045960", "tt0046855", "tt0048378", "tt0048562", "tt0…
$ parentTconst <chr> "tt0044284", "tt0046643", "tt0047702", "tt0047768", "tt0…
$ seasonNumber <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 8, 1, 10, 6, 2, 8, 4…
$ episodeNumber <dbl> 3, 4, 6, 10, 4, 20, 5, 2, 20, 6, 2, 3, 2, 10, 17, 5, 1, …
glimpse(TITLE_BASICS)Rows: 374,231
Columns: 9
$ tconst <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt…
$ titleType <chr> "short", "short", "short", "short", "short", "short", "…
$ primaryTitle <chr> "Carmencita", "Le clown et ses chiens", "Poor Pierrot",…
$ originalTitle <chr> "Carmencita", "Le clown et ses chiens", "Pauvre Pierrot…
$ isAdult <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ startYear <dbl> 1894, 1892, 1892, 1892, 1893, 1894, 1894, 1894, 1894, 1…
$ endYear <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ runtimeMinutes <dbl> 1, 5, 5, 12, 1, 1, 1, 1, 45, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
$ genres <chr> "Documentary,Short", "Animation,Short", "Animation,Come…
glimpse(TITLE_RATINGS)Rows: 374,231
Columns: 3
$ tconst <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt0…
$ averageRating <dbl> 5.7, 5.6, 6.5, 5.4, 6.2, 5.0, 5.4, 5.4, 5.4, 6.8, 5.2, 7…
$ numVotes <dbl> 2096, 283, 2104, 183, 2839, 197, 889, 2243, 215, 7728, 4…
TWELVE_EP_SERIES <- TITLE_EPISODES |>
group_by(parentTconst) |>
summarize(episodeCount = n()) |>
filter(episodeCount >= 12) |>
inner_join(TITLE_RATINGS,c("parentTconst" = "tconst")) |>
left_join(TITLE_BASICS,c("parentTconst" = "tconst")) |>
select(
seriesID = `parentTconst`,
`primaryTitle`,
`startYear`,
`endYear`,
`episodeCount`,
`averageRating`,
`numVotes`
) |>
arrange(desc(averageRating), desc(episodeCount), desc(numVotes)) |>
slice_head(n=5)
TWELVE_EP_SERIES# A tibble: 5 × 7
seriesID primaryTitle startYear endYear episodeCount averageRating numVotes
<chr> <chr> <dbl> <dbl> <int> <dbl> <dbl>
1 tt15613780 Craft Games 2014 NA 318 9.7 150
2 tt8560108 Prime Time 2016 NA 212 9.7 181
3 tt21989170 Gumbino 2016 NA 148 9.7 343
4 tt7151672 Choufli Hal 2005 2009 134 9.7 2930
5 tt23028046 Jogandofodda… 2019 2021 101 9.7 168
No clue what these shows are
#2.6: The TV Series Happy Days (1974-1984) gives us the common idiom "jump the shark". The phrase comes from
# a controversial fifth season episode (aired in 1977) in which a lead character literally jumped over a shark on
# water skis. Idiomatically, it is used to refer to the moment when a once-great show becomes ridiculous and
# rapidly loses quality. Is it true that episodes from later seasons of Happy Days have lower
# average ratings than the early seasons?
# First find the ID for Happy Days
TITLE_BASICS |> filter(primaryTitle == "Happy Days",startYear==1974,endYear==1984) |> select(`tconst`,`titleType`,`primaryTitle`) tconst titleType primaryTitle
1 tt0070992 tvSeries Happy Days
glimpse(TITLE_EPISODES)Rows: 3,023,489
Columns: 4
$ tconst <chr> "tt0045960", "tt0046855", "tt0048378", "tt0048562", "tt0…
$ parentTconst <chr> "tt0044284", "tt0046643", "tt0047702", "tt0047768", "tt0…
$ seasonNumber <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 8, 1, 10, 6, 2, 8, 4…
$ episodeNumber <dbl> 3, 4, 6, 10, 4, 20, 5, 2, 20, 6, 2, 3, 2, 10, 17, 5, 1, …
HAPPY_DAYS <- TITLE_EPISODES |>
filter(parentTconst=="tt0070992") |>
left_join(TITLE_RATINGS,c("tconst" = "tconst")) |>
select(
-`numVotes`,
-`parentTconst`,
-`tconst`
) |>
group_by(seasonNumber) |>
summarize(avgRating = mean(averageRating,na.rm = TRUE))
HAPPY_DAYS# A tibble: 11 × 2
seasonNumber avgRating
<dbl> <dbl>
1 1 7.58
2 2 7.69
3 3 7.7
4 4 7.43
5 5 7
6 6 7.02
7 7 6.33
8 8 5.3
9 9 6.4
10 10 6.7
11 11 7.33
min_point <- HAPPY_DAYS[which.min(HAPPY_DAYS$avgRating), ]
max_point <- HAPPY_DAYS[which.max(HAPPY_DAYS$avgRating), ]
sharkjump <- HAPPY_DAYS[HAPPY_DAYS$seasonNumber == 5, ]
highlighted_points <- factor(c("Lowest Rating", "Highest Rating", "Jumped Shark"),
levels = c("Lowest Rating", "Highest Rating", "Jumped Shark")) # Set the order here
# Plot ratings over time
ggplot(HAPPY_DAYS, aes(x = seasonNumber, y = avgRating)) +
geom_line(size=1) +
geom_point(color = "#D35400",size=2) +
# Add custom points for min, max, and season 5
geom_point(data = min_point, aes(color = highlighted_points[1]), size = 3) +
geom_point(data = max_point, aes(color = highlighted_points[2]), size = 3) +
geom_point(data = sharkjump, aes(color = highlighted_points[3]), size = 3) +
# Add labels and formatting
labs(title = "Happy Days Average Rating by Season", x = "Season", y = "Average Rating") +
# Customize ticks for x and y axes
scale_x_continuous(breaks = seq(min(HAPPY_DAYS$seasonNumber), max(HAPPY_DAYS$seasonNumber), by = 1)) +
scale_y_continuous(labels = label_number(), breaks = seq(floor(min(HAPPY_DAYS$avgRating)), ceiling(max(HAPPY_DAYS$avgRating)), by = 0.5)) +
# Manually define colors for the highlighted points
scale_color_manual(
values = c("Lowest Rating" = "blue", "Highest Rating" = "red", "Jumped Shark" = "green"),
name = ""
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
axis.text = element_text(size = 12),
# panel.grid.minor = element_blank(),
legend.position = "right", # Position the legend on the right
legend.text = element_text(face = "bold", size = 8),
# Set the background colors
panel.background = element_rect(fill = "gray90"), # Gray background for the plot area
plot.background = element_rect(fill = "gray90"), # Gray background for the entire plot
# Enhance axis lines
axis.line = element_line(color = "black", size = 1.2), # Change color and thickness of axis lines
axis.ticks = element_line(color = "black", size = 1), # Change color and thickness of tick marks
axis.ticks.length = unit(0.25, "cm"), # Adjust length of tick marks
# Enhance gridlines
panel.grid.major = element_line(color = "gray60", size = 0.5), # Change color and thickness of major gridlines
panel.grid.minor = element_line(color = "gray60", size = 0.5) # Change color and thickness of minor gridlines
)As we can see, the shark-jump likely caused an immediate dip in viewership, but it rebounded for the final 3 seasons and went out near the height of its popularity
Task 3: What is success?
Here we create a “success” metric to determine whether or not a film meets our own standards.
First let’s define success. We want to look at two features: * Average Rating * Num Votes Other features of note could be * Crew Size * Run Time
# First let's just get only movie data from 1960 onwards
MOVIES <- TITLE_BASICS |>
filter(titleType == "movie",startYear >= 1960)
# Add in ratings
MOVIES <- MOVIES |>
left_join(TITLE_RATINGS, c("tconst" = "tconst")) |>
select(
`tconst`,
`primaryTitle`,
`isAdult`,
releaseYear = `startYear`,
`runtimeMinutes`,
`genres`,
`averageRating`,
`numVotes`
)
# Add in crew size
CREWS <- TITLE_PRINCIPALS |>
group_by(tconst) |>
summarize(castCount = n())
MOVIES <- MOVIES |>
left_join(CREWS, c("tconst" = "tconst"))
# Count the number of genres in the 'genres' column
GENRE_COUNT <- MOVIES |>
separate_longer_delim(genres, delim = ",") |> # Split into multiple rows by delimiter
group_by(tconst) |> # Group by title to keep track of original rows
summarise(genre_count = n(), .groups = 'drop') # Count the number of genres
# Join back to the original MOVIES dataframe if needed
MOVIES <- MOVIES |>
left_join(GENRE_COUNT, by = "tconst")
# summary stats
describe(MOVIES) vars n mean sd median trimmed mad min
tconst* 1 118505 59253.00 34209.59 59253.0 59253.00 43923.51 1
primaryTitle* 2 118505 53767.12 30890.58 53903.0 53812.98 39907.14 1
isAdult 3 118505 0.01 0.09 0.0 0.00 0.00 0
releaseYear 4 118505 2004.03 16.86 2010.0 2006.16 13.34 1960
runtimeMinutes 5 116399 101.65 202.99 96.0 98.24 14.83 17
genres* 6 118505 579.73 274.72 633.0 599.44 201.63 1
averageRating 7 118505 5.88 1.32 6.1 5.95 1.19 1
numVotes 8 118505 9392.56 58913.55 473.0 1139.48 498.15 100
castCount 9 114101 18.47 4.11 19.0 18.70 2.97 1
genre_count 10 118505 2.00 0.83 2.0 2.00 1.48 1
max range skew kurtosis se
tconst* 118505 118504 0.00 -1.20 99.38
primaryTitle* 107056 107055 -0.01 -1.21 89.73
isAdult 1 1 10.77 113.99 0.00
releaseYear 2024 64 -0.94 -0.17 0.05
runtimeMinutes 51420 51403 224.22 52765.55 0.59
genres* 1010 1009 -0.66 -0.48 0.80
averageRating 10 9 -0.52 0.17 0.00
numVotes 2952383 2952283 17.20 456.62 171.14
castCount 57 56 -0.44 2.86 0.01
genre_count 3 2 0.00 -1.55 0.00
# 90% of movies fall between an average rating of 3.4 and 7.7
quantile(MOVIES$averageRating,.05) 5%
3.4
quantile(MOVIES$averageRating,.95)95%
7.7
# We can use this to determine both what makes a great move and what makes a flop
FLOPS <- MOVIES |>
filter(averageRating <= 3.4)
SUCCESSES <- MOVIES |>
filter(averageRating >= 7.7)
describe(FLOPS) vars n mean sd median trimmed mad min max
tconst* 1 6162 3081.50 1778.96 3081.5 3081.50 2283.95 1 6162.0
primaryTitle* 2 6162 3055.68 1762.50 3054.5 3056.01 2261.71 1 6107.0
isAdult 3 6162 0.00 0.05 0.0 0.00 0.00 0 1.0
releaseYear 4 6162 2010.13 12.33 2013.0 2012.26 8.90 1960 2024.0
runtimeMinutes 5 5952 98.85 559.05 89.0 89.42 10.38 26 43200.0
genres* 6 6162 226.28 128.67 239.0 233.78 170.50 1 396.0
averageRating 7 6162 2.82 0.52 2.9 2.88 0.44 1 3.4
numVotes 8 6162 1333.77 6306.58 334.0 468.67 269.83 100 180239.0
castCount 9 5886 18.30 3.86 18.0 18.36 2.97 1 49.0
genre_count 10 6162 1.94 0.84 2.0 1.92 1.48 1 3.0
range skew kurtosis se
tconst* 6161.0 0.00 -1.20 22.66
primaryTitle* 6106.0 0.00 -1.20 22.45
isAdult 1.0 20.90 435.00 0.00
releaseYear 64.0 -1.67 2.81 0.16
runtimeMinutes 43174.0 76.99 5933.03 7.25
genres* 395.0 -0.36 -1.27 1.64
averageRating 2.4 -1.09 0.74 0.01
numVotes 180139.0 14.03 256.60 80.34
castCount 48.0 0.03 3.71 0.05
genre_count 2.0 0.12 -1.58 0.01
describe(SUCCESSES) vars n mean sd median trimmed mad min
tconst* 1 7224 3612.50 2085.53 3612.5 3612.50 2677.58 1.0
primaryTitle* 2 7224 3574.23 2063.38 3573.5 3573.97 2651.63 1.0
isAdult 3 7224 0.00 0.05 0.0 0.00 0.00 0.0
releaseYear 4 7224 2004.91 17.78 2011.0 2007.24 14.83 1960.0
runtimeMinutes 5 6930 115.03 42.59 109.0 111.80 31.13 17.0
genres* 6 7224 261.63 111.20 295.0 272.18 94.89 1.0
averageRating 7 7224 8.10 0.42 8.0 8.03 0.30 7.7
numVotes 8 7224 44528.35 188879.33 699.0 3525.70 856.94 100.0
castCount 9 6892 16.43 5.12 17.0 16.71 4.45 1.0
genre_count 10 7224 1.90 0.84 2.0 1.88 1.48 1.0
max range skew kurtosis se
tconst* 7224 7223.0 0.00 -1.20 24.54
primaryTitle* 7150 7149.0 0.00 -1.20 24.28
isAdult 1 1.0 21.17 446.38 0.00
releaseYear 2024 64.0 -0.93 -0.26 0.21
runtimeMinutes 1440 1423.0 6.95 157.15 0.51
genres* 446 445.0 -0.79 -0.15 1.31
averageRating 10 2.3 1.51 2.30 0.00
numVotes 2952383 2952283.0 6.83 59.67 2222.26
castCount 38 37.0 -0.43 0.53 0.06
genre_count 3 2.0 0.18 -1.55 0.01
# Successes seem to be longer on average but also confined to a range of 1,440 minutes runtime while flops get a little silly
# A lot more people vote for successes (surprising because you'd think people are more likely to respond if they dislike something than if they like it)
# https://www.nationalstrategic.com/why-would-they-write-that-the-psychology-of-customer-reviews/
# The number of genres used seem to not matter pretty much at all
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
Mode(SUCCESSES$genres)[1] "Documentary"
Mode(FLOPS$genres)[1] "Horror"
head(sort(desc(table(SUCCESSES$genres))),n=20)
Documentary Drama
-1083 -1040
Comedy Comedy,Drama
-289 -248
Documentary,Music Drama,Romance
-241 -205
Biography,Documentary Comedy,Drama,Romance
-119 -112
Action,Crime,Drama Documentary,Sport
-105 -105
Documentary,History Thriller
-96 -90
Crime,Drama,Thriller Biography,Documentary,Music
-81 -79
Drama,Thriller Crime,Drama
-79 -77
Drama,Family \\N
-76 -70
Biography,Documentary,History Drama,War
-66 -65
head(sort(desc(table(FLOPS$genres))),n=20)
Horror Comedy Horror,Thriller
-956 -598 -334
Drama Thriller Action
-199 -156 -150
Comedy,Horror Comedy,Drama Horror,Mystery,Thriller
-135 -88 -87
Horror,Sci-Fi Sci-Fi Action,Adventure,Comedy
-84 -84 -76
Drama,Horror,Mystery Action,Sci-Fi Comedy,Romance
-74 -73 -73
Drama,Horror,Thriller Action,Crime,Drama Documentary
-70 -67 -65
Action,Drama Action,Thriller
-64 -62
table(SUCCESSES$isAdult)
0 1
7208 16
table(FLOPS$isAdult)
0 1
6148 14
# no effectDocumentaries and Dramas are successfull while Horror is not. Comedy can go either way.
Metrics for success: * Good genre (Drama, Documentary) * High number of votes (> 500) * Runtime between 70 minutes and 150 minutes. Dock points for each STD outside of that Metrics for flop: * Bad genre (Horror, Comedy) * Low number of votes (<= 500) * Runtime greater than 200 minutes
# Define the metric for success function
metric_for_success <- function(df) {
df <- df |>
mutate(
# Calculate positive contributions
positive_metric = ifelse(str_detect(genres, "Drama|Documentary"), 0.2, 0) +
ifelse(numVotes >= 500, 0.4, 0) +
ifelse(runtimeMinutes >= 70 & runtimeMinutes <= 150, 0.4, 0),
# Calculate negative contributions
negative_metric = ifelse(str_detect(genres, "Horror|Comedy"), -0.4, 0) +
ifelse(numVotes < 500 | is.na(numVotes), -0.3, 0) +
ifelse(runtimeMinutes > 200, -pmin(0.5, (runtimeMinutes - 200) / 90), 0),
# Combine the positive and negative contributions
metric = positive_metric + negative_metric,
# Bound the metric between -1 and 1
metric = pmin(pmax(metric, -1), 1),
# Round the metric to 3 decimal places
metric = round(metric, 3)
) |>
# Drop the intermediate columns (optional)
select(-positive_metric, -negative_metric)
return(df)
}
#Apply the metric function to the DataFrame
MOVIES <- metric_for_success(MOVIES)
table(MOVIES$metric)
-1 -0.944 -0.8 -0.767 -0.722 -0.7 -0.611 -0.6 -0.567 -0.544 -0.533
9 1 8 1 1 642 1 59 1 5 1
-0.5 -0.478 -0.467 -0.444 -0.433 -0.411 -0.4 -0.389 -0.378 -0.367 -0.356
293 2 1 1 1 2 3 1 2 2 3
-0.344 -0.333 -0.322 -0.311 -0.3 -0.289 -0.278 -0.267 -0.256 -0.244 -0.233
1 2 5 2 15056 1 2 2 1 1 1
-0.222 -0.211 -0.189 -0.167 -0.144 -0.122 -0.111 -0.1 -0.067 -0.056 -0.033
1 9 3 2 1 1 5 8398 2 1 1
-0.022 0 0.022 0.067 0.1 0.111 0.133 0.156 0.167 0.178 0.189
1 420 1 1 7901 1 2 4 1 4 2
0.2 0.211 0.233 0.244 0.256 0.267 0.278 0.3 0.311 0.322 0.333
467 1 1 2 1 2 1 26557 2 2 1
0.344 0.367 0.378 0.389 0.4 0.411 0.422 0.433 0.444 0.456 0.489
1 1 6 2 16669 4 2 2 2 3 6
0.5 0.511 0.522 0.533 0.544 0.556 0.567 0.578 0.589 0.6 0.8
1 5 2 1 2 1 1 4 3 10527 6349
1
22892
#3.1: Choose the top 5-10 movies by my metric to confirm successes
MOVIES |>
filter(metric==1) |>
select(
-`isAdult`,
-`tconst`,
-`genre_count`
) |>
sample_n(10) primaryTitle releaseYear runtimeMinutes genres
1 The Last Lullaby 2008 93 Drama,Mystery
2 Leto 2018 126 Biography,Drama,History
3 A Girl Returned 2021 110 Drama
4 Intrusion 2021 92 Action,Drama,Mystery
5 Night of the Juggler 1980 101 Action,Crime,Drama
6 Children of Heaven 1997 89 Drama,Family,Sport
7 Sunny 2021 93 Crime,Drama,Thriller
8 Close 2022 104 Drama
9 Lebanon 2009 93 Drama,War
10 Farewell to the King 1989 117 Action,Adventure,Drama
averageRating numVotes castCount metric
1 6.1 1023 20 1
2 7.3 8076 24 1
3 6.9 500 21 1
4 5.3 21930 24 1
5 6.5 1128 20 1
6 8.2 82639 17 1
7 6.7 6429 16 1
8 7.8 39014 NA 1
9 6.9 12060 22 1
10 6.2 3176 22 1
# This metric works really well in conjunction with a high number of votes
MOVIES |>
arrange(desc(metric),desc(numVotes)) |>
slice_head(n=10) tconst primaryTitle isAdult releaseYear runtimeMinutes
1 tt0111161 The Shawshank Redemption 0 1994 142
2 tt0137523 Fight Club 0 1999 139
3 tt0109830 Forrest Gump 0 1994 142
4 tt0114369 Se7en 0 1995 127
5 tt0372784 Batman Begins 0 2005 140
6 tt0102926 The Silence of the Lambs 0 1991 118
7 tt7286456 Joker 0 2019 122
8 tt1130884 Shutter Island 0 2010 138
9 tt0482571 The Prestige 0 2006 130
10 tt0099685 Goodfellas 0 1990 145
genres averageRating numVotes castCount genre_count metric
1 Drama 9.3 2952383 19 1 1
2 Drama 8.8 2384372 23 1 1
3 Drama,Romance 8.8 2309923 21 2 1
4 Crime,Drama,Mystery 8.6 1843387 21 3 1
5 Action,Crime,Drama 8.2 1607729 26 3 1
6 Crime,Drama,Thriller 8.6 1583671 21 3 1
7 Crime,Drama,Thriller 8.4 1548918 23 3 1
8 Drama,Mystery,Thriller 8.2 1496563 21 3 1
9 Drama,Mystery,Sci-Fi 8.5 1471339 22 3 1
10 Biography,Crime,Drama 8.7 1285009 19 3 1
#3.2: Choose 3-5 movies with large numbers of IMDb votes that socre poorly on your success metric and confirm
# that they are indeed low quality
MOVIES |>
arrange(metric, desc(numVotes)) |>
select(`primaryTitle`,
`releaseYear`,
`numVotes`,
`averageRating`,
`genres`,
`metric`) |>
slice_head(n=5) primaryTitle releaseYear numVotes averageRating
1 Out 1: Spectre 1972 437 7.2
2 Three Mirrors Creature's Flashes of Flesh 2023 422 7.8
3 The Cremaster Cycle 2003 306 6.9
4 Haripada Bandwala 2016 290 4.4
5 The Movie Orgy 2009 211 7.8
genres metric
1 Comedy,Drama,Thriller -1
2 Drama,Horror,Sci-Fi -1
3 Drama,Fantasy,Horror -1
4 Comedy,Romance -1
5 Comedy,Documentary,History -1
I didn’t need a personal metric to tell me that these 5 movies suck
#3.3 Choose a prestige actor or director and confirm that they have many projects with high scores on your success metric.
NAME_BASICS |>
filter(primaryName == "Stanley Kubrick") |>
separate_longer_delim(knownForTitles, delim = ",") |>
inner_join(MOVIES, by = c("knownForTitles" = "tconst")) |>
select(
`primaryName`,
`primaryTitle`,
`averageRating`,
`numVotes`,
`genres`,
`runtimeMinutes`,
`metric`
) primaryName primaryTitle averageRating numVotes
1 Stanley Kubrick 2001: A Space Odyssey 8.3 733226
2 Stanley Kubrick A Clockwork Orange 8.2 894278
3 Stanley Kubrick Barry Lyndon 8.1 187061
4 Stanley Kubrick Full Metal Jacket 8.2 804176
genres runtimeMinutes metric
1 Adventure,Sci-Fi 149 0.8
2 Crime,Sci-Fi 136 0.8
3 Adventure,Drama,War 185 0.6
4 Drama,War 116 1.0
Pretty good
#3.4 Perform at least one other "spot check" validation
counts <- MOVIES %>%
summarize(
above_7_5_high_metric = sum(averageRating > 7.5 & metric >= 0.6, na.rm = TRUE),
between_5_and_7_5_high_metric = sum(averageRating >= 5 & averageRating <= 7.5 & metric >= 0.6, na.rm = TRUE),
below_5_high_metric = sum(averageRating < 5 & metric >= 0.6, na.rm = TRUE),
above_5_and_7_5_low_metric = sum(averageRating > 7.5 & metric <= 0.3, na.rm = TRUE),
between_5_and_7_5_low_metric = sum(averageRating >= 5 & averageRating <= 7.5 & metric <= 0.3, na.rm = TRUE),
below_5_low_metric = sum(averageRating < 5 & metric <= 0.3, na.rm = TRUE)
)
counts above_7_5_high_metric between_5_and_7_5_high_metric below_5_high_metric
1 4229 30767 4772
above_5_and_7_5_low_metric between_5_and_7_5_low_metric below_5_low_metric
1 3766 39965 16177
Although the metric is imperfect, it is pretty good at telling us what’s good and what’s not (based on average ratings)
#3.5: Come up with a numerical threshold for a project to be a ‘success’; that is, determine a value such that movies above `v` are all “solid” or better.
v <- 0.6Task 4: What makes a good movie?
# Juuuust incase
avgrtgthrsh <- 6.0
# Create a new column indicating success based on averageRating
MOVIES <- MOVIES |>
mutate(success_averageRating = ifelse(averageRating >= avgrtgthrsh, 1, 0),
success_metric = ifelse(metric >= v, 1, 0)) |>
mutate(decade = floor(releaseYear / 10) * 10)
MOVIES_GENRES <- MOVIES |>
filter(genres != "\\N", averageRating != "\\N") |>
separate_longer_delim(genres, delim = ",") |>
mutate(genres = trimws(genres)) |>
mutate(decade = floor(releaseYear / 10) * 10)
# Calculate total films by genre and decade
total_films_by_genre <- MOVIES_GENRES |>
group_by(decade, genres) |>
summarize(total_count = n(), .groups = "drop")
# Filter genres by count
popular_genres <- total_films_by_genre |>
filter(total_count > 5000) |>
select(genres) |>
distinct()
# Count successes for averageRating (for popular genres)
success_by_genre_rating_popular <- MOVIES_GENRES |>
filter(genres %in% popular_genres$genres) |>
group_by(decade, genres) |>
summarize(success_count = sum(success_averageRating, na.rm = TRUE), .groups = "drop") |>
left_join(total_films_by_genre, by = c("decade", "genres")) |>
mutate(success_rate = success_count / total_count * 100) |>
arrange(decade, desc(success_rate))
# Count successes for metric (for popular genres)
success_by_genre_metric_popular <- MOVIES_GENRES |>
filter(genres %in% popular_genres$genres) |>
group_by(decade, genres) |>
summarize(success_count = sum(success_metric, na.rm = TRUE), .groups = "drop") |>
left_join(total_films_by_genre, by = c("decade", "genres")) |>
mutate(success_rate = success_count / total_count * 100) |>
arrange(decade, desc(success_rate))
# Count successes for averageRating (for less popular genres)
success_by_genre_rating_less <- MOVIES_GENRES |>
filter(!(genres %in% popular_genres$genres)) |>
group_by(decade, genres) |>
summarize(success_count = sum(success_averageRating, na.rm = TRUE), .groups = "drop") |>
left_join(total_films_by_genre, by = c("decade", "genres")) |>
mutate(success_rate = success_count / total_count * 100) |>
arrange(decade, desc(success_rate))
# Count successes for metric (for less popular genres)
success_by_genre_metric_less <- MOVIES_GENRES |>
filter(!(genres %in% popular_genres$genres)) |>
group_by(decade, genres) |>
summarize(success_count = sum(success_metric, na.rm = TRUE), .groups = "drop") |>
left_join(total_films_by_genre, by = c("decade", "genres")) |>
mutate(success_rate = success_count / total_count * 100) |>
arrange(decade, desc(success_rate))
# Plotting Success Rates for Average Rating (Popular Genres)
p1 <- ggplot(success_by_genre_rating_popular, aes(x = decade, y = success_rate, color = genres, group = genres)) +
geom_line(size = 1) + # Add line
geom_point(size = 3) + # Add points for clarity
labs(title = "Success Rates by Genre (Average Rating - Popular Genres)", x = "Decade", y = "Success Rate (%)") +
scale_y_continuous(limits = c(0, 100)) + # Set y-axis limits from 0 to 100
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
plot.title = element_text(hjust = 0.5) # Center the title
)
# Convert ggplot to plotly for hover functionality
p1_interactive <- ggplotly(p1)
# Plotting Success Rates for Metric (Popular Genres)
p2 <- ggplot(success_by_genre_metric_popular, aes(x = decade, y = success_rate, color = genres, group = genres)) +
geom_line(size = 1) + # Add line
geom_point(size = 3) + # Add points for clarity
labs(title = "Success Rates by Genre (Metric - Popular Genres)", x = "Decade", y = "Success Rate (%)") +
scale_y_continuous(limits = c(0, 100)) + # Set y-axis limits from 0 to 100
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
plot.title = element_text(hjust = 0.5) # Center the title
)
# Convert ggplot to plotly for hover functionality
p2_interactive <- ggplotly(p2)
# Plotting Success Rates for Average Rating (Less Popular Genres)
p3 <- ggplot(success_by_genre_rating_less, aes(x = decade, y = success_rate, color = genres, group = genres)) +
geom_line(size = 1) + # Add line
geom_point(size = 3) + # Add points for clarity
labs(title = "Success Rates by Genre (Average Rating - Less Popular Genres)", x = "Decade", y = "Success Rate (%)") +
scale_y_continuous(limits = c(0, 100)) + # Set y-axis limits from 0 to 100
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
plot.title = element_text(hjust = 0.5) # Center the title
)
# Convert ggplot to plotly for hover functionality
p3_interactive <- ggplotly(p3)
# Plotting Success Rates for Metric (Less Popular Genres)
p4 <- ggplot(success_by_genre_metric_less, aes(x = decade, y = success_rate, color = genres, group = genres)) +
geom_line(size = 1) + # Add line
geom_point(size = 3) + # Add points for clarity
labs(title = "Success Rates by Genre (Metric - Less Popular Genres)", x = "Decade", y = "Success Rate (%)") +
scale_y_continuous(limits = c(0, 100)) + # Set y-axis limits from 0 to 100
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
plot.title = element_text(hjust = 0.5) # Center the title
)
# Convert ggplot to plotly for hover functionality
p4_interactive <- ggplotly(p4)p1_interactivep2_interactivep3_interactivep4_interactiveThey’re interactive!
#4.1: Which genre has the most "successs" in each decade"?
# Count successes for averageRating
success_by_genre_rating <- MOVIES_GENRES |>
group_by(decade, genres) |>
summarize(success_count = sum(success_averageRating, na.rm = TRUE), .groups = "drop") |>
arrange(decade, desc(success_count))
# Count successes for metric
success_by_genre_metric <- MOVIES_GENRES |>
group_by(decade, genres) |>
summarize(success_count = sum(success_metric, na.rm = TRUE), .groups = "drop") |>
arrange(decade, desc(success_count))
# Identify the genre with the most successes for each decade (Average Rating)
most_successful_genre_rating <- success_by_genre_rating |>
group_by(decade) |>
slice_max(success_count, n = 2) |>
ungroup()
# Identify the genre with the most successes for each decade (Metric)
most_successful_genre_metric <- success_by_genre_metric |>
group_by(decade) |>
slice_max(success_count, n = 2) |>
ungroup()
most_successful_genre_rating# A tibble: 14 × 3
decade genres success_count
<dbl> <chr> <dbl>
1 1960 Drama 2567
2 1960 Comedy 1287
3 1970 Drama 2990
4 1970 Comedy 1318
5 1980 Drama 3220
6 1980 Comedy 1783
7 1990 Drama 4170
8 1990 Comedy 2184
9 2000 Drama 7339
10 2000 Comedy 3325
11 2010 Drama 11449
12 2010 Documentary 5084
13 2020 Drama 5103
14 2020 Comedy 2174
most_successful_genre_metric# A tibble: 15 × 3
decade genres success_count
<dbl> <chr> <dbl>
1 1960 Drama 1526
2 1960 Adventure 308
3 1960 Romance 308
4 1970 Drama 1890
5 1970 Crime 517
6 1980 Drama 2083
7 1980 Action 600
8 1990 Drama 3258
9 1990 Comedy 888
10 2000 Drama 6342
11 2000 Comedy 1711
12 2010 Drama 10358
13 2010 Action 2553
14 2020 Drama 4501
15 2020 Thriller 1319
Drama and Action films are pretty succesfull!
#4.2 Which genre consistently has the most "successes"? Which genre used to reliably produce "successes" and has fallen out of favor?
p1_interactivep2_interactiveDocumentaries have the most consistent successes.
Horror used to produce a lot of successes but has been on a downward trend each decade.
#4.3: What genre has produced the most “successes” since 2010?
#Does it have the highest success rate or does it only have a large number of successes because there are many productions in that genre?
p1_interactivep2_interactivemost_successful_genre_rating# A tibble: 14 × 3
decade genres success_count
<dbl> <chr> <dbl>
1 1960 Drama 2567
2 1960 Comedy 1287
3 1970 Drama 2990
4 1970 Comedy 1318
5 1980 Drama 3220
6 1980 Comedy 1783
7 1990 Drama 4170
8 1990 Comedy 2184
9 2000 Drama 7339
10 2000 Comedy 3325
11 2010 Drama 11449
12 2010 Documentary 5084
13 2020 Drama 5103
14 2020 Comedy 2174
most_successful_genre_metric# A tibble: 15 × 3
decade genres success_count
<dbl> <chr> <dbl>
1 1960 Drama 1526
2 1960 Adventure 308
3 1960 Romance 308
4 1970 Drama 1890
5 1970 Crime 517
6 1980 Drama 2083
7 1980 Action 600
8 1990 Drama 3258
9 1990 Comedy 888
10 2000 Drama 6342
11 2000 Comedy 1711
12 2010 Drama 10358
13 2010 Action 2553
14 2020 Drama 4501
15 2020 Thriller 1319
Drama seems to be the answer to both of these
#4.4: What genre has become more popular in recent years?
p3_interactivep4_interactiveAction movies
Task 5: Identifying the Crew
Hang on to your hats because there’s about to be a lot of data manipulation.
Basically, we’re going to whittle down the list of actors and directors until we find a group that matches what we’re looking for in producing the next box-office smash hit.
#Filter TITLE_BASICS based on metric threshold and genre
FILTERED_MOVIES <- MOVIES |>
separate_longer_delim(genres, delim = ",") |>
filter(success_metric == 1, genres %in% c("Action", "Drama", "Thriller", "Documentary")) |>
group_by(tconst) |>
summarize(
title = first(primaryTitle), # Adjust according to your dataset
year = first(releaseYear), # Adjust according to your dataset
metric = first(metric), # Adjust according to your dataset,
avgRating = first(averageRating),
numVotes,
decade = first(decade),
castCount = first(castCount),
genres = paste(unique(genres), collapse = ", ") # Combine genres into one string
)
ACTORS <- NAME_BASICS |>
filter(grepl("actor", primaryProfession, ignore.case = TRUE), is.na(deathYear)) |>
separate_longer_delim(primaryProfession, delim = ",") |>
filter(primaryProfession == "actor") |>
mutate(age = 2024 - birthYear) |>
select(-`deathYear`)
# Joining with TITLE_PRINCIPALS
ACTORS_TITLES <- ACTORS |>
inner_join(TITLE_PRINCIPALS, by = c("nconst" = "nconst")) |>
left_join(TITLE_BASICS, by = c("tconst" = "tconst")) |>
inner_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
select(
`nconst`,
`primaryName`,
`age`,
`tconst`,
`primaryTitle`,
releaseYear = `startYear`,
`decade`,
`castCount`,
`runtimeMinutes`,
genres = `genres.x`,
mainGenre = `genres.y`,
`metric`,
`avgRating`
) |>
distinct()
DIRECTORS <- NAME_BASICS |>
filter(grepl("director", primaryProfession, ignore.case = TRUE), is.na(deathYear)) |>
separate_longer_delim(primaryProfession, delim = ",") |>
filter(primaryProfession == "director") |>
mutate(age = 2024 - birthYear) |>
select(-`deathYear`)
# Joining with TITLE_PRINCIPALS
DIRECTORS_TITLES <- DIRECTORS |>
inner_join(TITLE_PRINCIPALS, by = c("nconst" = "nconst")) |>
left_join(TITLE_BASICS, by = c("tconst" = "tconst")) |>
inner_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
select(
`nconst`,
`primaryName`,
`age`,
`tconst`,
`primaryTitle`,
releaseYear = `startYear`,
`decade`,
`castCount`,
`runtimeMinutes`,
genres = `genres.x`,
mainGenre = `genres.y`,
`metric`,
`avgRating`
) |>
distinct()
# Let's get anyone who's been in a film since at least 2015
LAST_TEN_YEARS_ACTORS <- ACTORS_TITLES |>
filter(releaseYear >= 2015)
LAST_TEN_YEARS_DIRECTORS <- DIRECTORS_TITLES |>
filter(releaseYear >= 2015)
# since Action and Drama are the best genres, let's subset to those
BEST_GENRES_ACTORS <- LAST_TEN_YEARS_ACTORS |>
filter(mainGenre %in% c("Action", "Drama"))
BEST_GENRES_DIRECTORS <- LAST_TEN_YEARS_DIRECTORS |>
filter(mainGenre %in% c("Action", "Drama"))
# For actors, let's get people who can work with big supporting casts.
# We don't know if that's what we want, but it'll be nice to have
cast_threshold <- BEST_GENRES_ACTORS |>
summarize(threshold = quantile(castCount, 0.95, na.rm = TRUE)) |>
pull(threshold)
ACTORS_CREWS <- BEST_GENRES_ACTORS |>
group_by(nconst) |>
summarize(median_castCount = median(castCount, na.rm = TRUE)) |>
filter(median_castCount >= cast_threshold) |> # Filter for median castCount in the top 5 percentilE
ungroup()
ACTORS_CAST <- BEST_GENRES_ACTORS |>
inner_join(ACTORS_CREWS, by = c("nconst" = "nconst")) |>
select(
-`median_castCount`
) |>
distinct()
# Let's pull their best works only
BEST_FILMS_ACTORS <- ACTORS_CAST |>
group_by(nconst) |>
filter(avgRating == max(avgRating, na.rm = TRUE)) |>
ungroup()
# For actors, let's narrow it down to actors whos best film is from this decade
CURRENT_BEST_ACTORS <- BEST_FILMS_ACTORS |>
filter(decade==2020)
# We'll now whittle down directors. Who can get a runtime in our sweet-spot of 70 to 150 minutes?
filtered_median_runtime <- BEST_GENRES_DIRECTORS |>
group_by(nconst) |>
summarize(median_runtime = median(runtimeMinutes, na.rm = TRUE)) |> # Calculate median runtime
filter(median_runtime >= 70 & median_runtime <= 150) # Filter for median runtime between 70 and 150
# Doing the same but for directors
DIRECTORS_RUNTIME <- BEST_GENRES_DIRECTORS |>
inner_join(filtered_median_runtime, by = c("nconst" = "nconst"))
# let's pull their best movies only
BEST_FILMS_DIRECTORS <- DIRECTORS_RUNTIME |>
group_by(nconst) |>
filter(avgRating == max(avgRating, na.rm = TRUE)) |>
ungroup()
# Let's remove any director who is also an actor
NON_ACTOR_DIRECTORS <- BEST_FILMS_DIRECTORS |>
anti_join(BEST_FILMS_ACTORS, by = c("nconst" = "nconst"))
# For directors, let's narrow it down to actors whos best film is from this decade
CURRENT_BEST_DIRECTORS <- NON_ACTOR_DIRECTORS |>
filter(decade==2020)dim(NON_ACTOR_DIRECTORS)[1] 15530 14
dim(BEST_FILMS_ACTORS)[1] 1371 13
Taking a quick breather! We just got out data down quite a bit. Let’s keep going.
MOST_POPULAR_ACTORS <- CURRENT_BEST_ACTORS |>
left_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
select(
`nconst`,
`primaryName`,
`age`,
`primaryTitle`,
`year`,
`mainGenre`,
avgRating = `avgRating.x`,
`numVotes`
) |>
arrange(desc(numVotes))
# Let's also bring in how many films they've worked on
ACTOR_FILM_CT <- TITLE_PRINCIPALS |>
filter(category == "actor") |>
group_by(nconst) |>
summarize(film_count = n())
MOST_POPULAR_ACTORS <- MOST_POPULAR_ACTORS |>
left_join(ACTOR_FILM_CT, by = c("nconst" = "nconst"))
MOST_POPULAR_ACTORS <- MOST_POPULAR_ACTORS |>
filter(!is.na(film_count))
head(MOST_POPULAR_ACTORS)# A tibble: 6 × 9
nconst primaryName age primaryTitle year mainGenre avgRating numVotes
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 nm8188622 Jacob Batalon 28 Spider-Man:… 2021 Action 8.2 910046
2 nm1069587 Darshan Kuma… 38 The Kashmir… 2022 Drama 8.6 575254
3 nm12220302 Sourav Verma NA The Kashmir… 2022 Drama 8.6 575254
4 nm1282966 O-T Fagbenle 43 Black Widow 2021 Action 6.6 437506
5 nm0000439 Neil Patrick… 51 The Matrix … 2021 Action 5.6 286445
6 nm1186373 Adam Sztykiel 46 Black Adam 2022 Action 6.2 282231
# ℹ 1 more variable: film_count <int>
# Step 1: Join MOST_POPULAR_ACTORS with TITLE_PRINCIPALS to get tconst (movies the actor was part of)
actor_movies <- MOST_POPULAR_ACTORS |>
inner_join(TITLE_PRINCIPALS, by = "nconst") |>
select(nconst, tconst)
# Step 2: Join actor_movies with MOVIES to get the average rating of each movie
actor_movie_ratings <- actor_movies |>
inner_join(MOVIES, by = "tconst") |>
select(nconst, tconst, averageRating)
# Step 3: Calculate the average rating for each actor across their movies
actor_avg_rating <- actor_movie_ratings |>
group_by(nconst) |>
summarize(avg_rating = mean(averageRating, na.rm = TRUE))
# Step 4: Join the calculated average rating back to MOST_POPULAR_ACTORS
MOST_POPULAR_ACTORS <- MOST_POPULAR_ACTORS |>
left_join(actor_avg_rating, by = "nconst")
MOST_POPULAR_ACTORS <- MOST_POPULAR_ACTORS |>
select(
`nconst`,
`primaryName`,
`age`,
`primaryTitle`,
`year`,
`mainGenre`,
avgMainMovieRating = `avgRating`,
`numVotes`,
filmCount = `film_count`,
avgActorRating = `avg_rating`
)
MOST_POPULAR_DIRECTORS <- CURRENT_BEST_DIRECTORS |>
left_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
select(
`nconst`,
`primaryName`,
`age`,
`primaryTitle`,
`year`,
`mainGenre`,
avgRating = `avgRating.x`,
`numVotes`
) |>
arrange(desc(numVotes))
# Let's also bring in how many films they've worked on
DIRECTOR_FILM_CT <- TITLE_PRINCIPALS |>
filter(category == "director") |>
group_by(nconst) |>
summarize(film_count = n())
MOST_POPULAR_DIRECTORS <- MOST_POPULAR_DIRECTORS |>
left_join(DIRECTOR_FILM_CT, by = c("nconst" = "nconst"))
MOST_POPULAR_DIRECTORS <- MOST_POPULAR_DIRECTORS |>
filter(!is.na(film_count))
# Step 1: Join MOST_POPULAR_DIRECTORS with TITLE_PRINCIPALS to get tconst (movies the director was part of)
director_movies <- MOST_POPULAR_DIRECTORS |>
inner_join(TITLE_PRINCIPALS, by = "nconst") |>
select(nconst, tconst)
# Step 2: Join director_movies with MOVIES to get the average rating of each movie
director_movies_ratings <- director_movies |>
inner_join(MOVIES, by = "tconst") |>
select(nconst, tconst, averageRating)
# Step 3: Calculate the average rating for each director across their movies
director_avg_rating <- director_movies_ratings |>
group_by(nconst) |>
summarize(avg_rating = mean(averageRating, na.rm = TRUE))
# Step 4: Join the calculated average rating back to MOST_POPULAR_DIRECTORS
MOST_POPULAR_DIRECTORS <- MOST_POPULAR_DIRECTORS |>
left_join(director_avg_rating, by = "nconst")
MOST_POPULAR_DIRECTORS <- MOST_POPULAR_DIRECTORS |>
select(
`nconst`,
`primaryName`,
`age`,
`primaryTitle`,
`year`,
`mainGenre`,
avgMainMovieRating = `avgRating`,
`numVotes`,
filmCount = `film_count`,
avgDirectorRating = `avg_rating`
)# Results!
head(MOST_POPULAR_ACTORS)# A tibble: 6 × 10
nconst primaryName age primaryTitle year mainGenre avgMainMovieRating
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 nm8188622 Jacob Batalon 28 Spider-Man:… 2021 Action 8.2
2 nm1069587 Darshan Kuma… 38 The Kashmir… 2022 Drama 8.6
3 nm12220302 Sourav Verma NA The Kashmir… 2022 Drama 8.6
4 nm1282966 O-T Fagbenle 43 Black Widow 2021 Action 6.6
5 nm0000439 Neil Patrick… 51 The Matrix … 2021 Action 5.6
6 nm1186373 Adam Sztykiel 46 Black Adam 2022 Action 6.2
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgActorRating <dbl>
dim(MOST_POPULAR_ACTORS)[1] 474 10
head(MOST_POPULAR_DIRECTORS)# A tibble: 6 × 10
nconst primaryName age primaryTitle year mainGenre avgMainMovieRating
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 nm1218281 Jon Watts 43 Spider-Man: N… 2021 Action 8.2
2 nm0001282 Tony Goldwyn 64 Oppenheimer 2023 Drama 8.3
3 nm1347153 Tyler Perry 55 Don't Look Up 2021 Drama 7.2
4 nm0411539 Puneet Issar 65 The Kashmir F… 2022 Drama 8.6
5 nm0426059 Rian Johnson 51 Glass Onion 2022 Drama 7.1
6 nm0781913 Noah Segan 41 Glass Onion 2022 Drama 7.1
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgDirectorRating <dbl>
dim(MOST_POPULAR_DIRECTORS)[1] 3254 10
# Identify the directors with the highest values for numVotes, filmcount, and avgDirectorRating
highest_votes <- MOST_POPULAR_DIRECTORS |> filter(numVotes == max(numVotes, na.rm = TRUE))
highest_film_count <- MOST_POPULAR_DIRECTORS |> filter(filmCount == max(filmCount, na.rm = TRUE))
highest_avg_rating <- MOST_POPULAR_DIRECTORS |> filter(avgDirectorRating == max(avgDirectorRating, na.rm = TRUE))
# Combine the labels into one data frame
labels_df <- bind_rows(highest_votes, highest_film_count, highest_avg_rating)
# Create the scatter plot
ggplot(MOST_POPULAR_DIRECTORS, aes(x = filmCount, y = numVotes, color = avgDirectorRating)) +
geom_point(size = 3) + # Add points with size 3
scale_color_gradient(low = "blue", high = "red") + # Color gradient from blue (low) to red (high)
scale_y_continuous(labels = scales::comma, limits = c(0, 500000)) + # Adjust y-axis to max at 500,000
labs(
title = "Scatter Plot of Directors' Film Count vs Number of Votes",
x = "Film Count",
y = "Number of Votes",
color = "Avg Director Rating" # Label for the color legend
) +
geom_text(data = labels_df, aes(label = paste0(" ", primaryName)),
vjust = -1, color = "black", size = 4, font = "bold") + # Add labels for the highest points
theme_minimal(base_size = 15) + # Increase font size for better readability
theme(
plot.background = element_rect(fill = "darkgray"),
panel.background = element_rect(fill = "darkgray"),
axis.line = element_line(color = "white", size = 1.2),
axis.text = element_text(color = "white"), # Set axis text color to white
axis.title = element_text(color = "white", face = "bold"), # Bolden axis titles
plot.title = element_text(hjust = 0.5, color = "white", face = "bold"), # Center the title and bold it
axis.text.x = element_text(angle = 45, hjust = 1) # Rotate x-axis labels if needed
)MOST_POPULAR_DIRECTORS# A tibble: 3,254 × 10
nconst primaryName age primaryTitle year mainGenre avgMainMovieRating
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 nm1218281 Jon Watts 43 Spider-Man:… 2021 Action 8.2
2 nm0001282 Tony Goldwyn 64 Oppenheimer 2023 Drama 8.3
3 nm1347153 Tyler Perry 55 Don't Look … 2021 Drama 7.2
4 nm0411539 Puneet Issar 65 The Kashmir… 2022 Drama 8.6
5 nm0426059 Rian Johnson 51 Glass Onion 2022 Drama 7.1
6 nm0781913 Noah Segan 41 Glass Onion 2022 Drama 7.1
7 nm3725055 Jessica Henw… NA Glass Onion 2022 Drama 7.1
8 nm0072600 Ned Benson 47 Black Widow 2021 Action 6.6
9 nm0075244 Gabriel Beri… 75 Black Widow 2021 Action 6.6
10 nm0795153 Cate Shortla… 56 Black Widow 2021 Action 6.6
# ℹ 3,244 more rows
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgDirectorRating <dbl>
# Rian JohnsonMOST_POPULAR_ACTORS |>
filter(filmCount >= 10, mainGenre == "Action") |>
arrange(desc(avgActorRating)) |>
print(n=50) # A tibble: 62 × 10
nconst primaryName age primaryTitle year mainGenre avgMainMovieRating
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 nm2128363 Paul Briggs 50 Raya and th… 2021 Action 7.3
2 nm4514080 Bradley Cam… NA Psycho-Pass… 2023 Action 7
3 nm0239504 Bruce DuBose 64 Psycho-Pass… 2023 Action 7
4 nm3139597 Shawn Gann 43 Psycho-Pass… 2023 Action 7
5 nm5211370 Justin Brin… 33 Black Clove… 2023 Action 7.4
6 nm2995162 Artt Butler 55 Mortal Komb… 2022 Action 6.7
7 nm0254602 R. Bruce El… 75 Psycho-Pass… 2023 Action 7
8 nm11111075 A.J. Beckles 26 Mobile Suit… 2024 Action 6.8
9 nm0000316 Mel Brooks 98 Paws of Fur… 2022 Action 5.7
10 nm2616557 Ray Chase 37 Demon Slaye… 2023 Action 7.6
11 nm0954900 Zeno Robins… 31 Mobile Suit… 2024 Action 6.8
12 nm13506312 Alex Hom NA New Gods: Y… 2022 Action 6.6
13 nm0136482 Lorne Cardi… 60 Four Souls … 2023 Action 7.2
14 nm6770060 Bill Butts 38 Pokémon the… 2020 Action 6.4
15 nm3739133 Bayardo De … NA Mortal Komb… 2021 Action 6.5
16 nm0089707 Steve Blum 64 Black Clove… 2023 Action 7.4
17 nm0564402 Simon McBur… 67 Wolfwalkers 2020 Action 8
18 nm6981965 Kellen Goff NA Pokémon the… 2020 Action 6.4
19 nm0196654 Daniel Dae … NA Raya and th… 2021 Action 7.3
20 nm0443632 Thierry Kaz… NA Miraculous:… 2023 Action 6.1
21 nm0005458 Jason Stath… 57 Fast X 2023 Action 5.7
22 nm0010075 Jensen Ackl… 46 Batman: The… 2021 Action 7.2
23 nm0010075 Jensen Ackl… 46 Batman: The… 2021 Action 7.2
24 nm8188622 Jacob Batal… 28 Spider-Man:… 2021 Action 8.2
25 nm0224007 John DiMagg… 56 Batman: The… 2021 Action 7.2
26 nm1716941 Martial Le … NA Miraculous:… 2023 Action 6.1
27 nm0541902 Aasif Mandvi 58 Paws of Fur… 2022 Action 5.7
28 nm1977856 Chin Han 55 Mortal Komb… 2021 Action 6
29 nm3112453 Mohammed Am… 43 Black Adam 2022 Action 6.2
30 nm1684869 Troy Baker 48 Batman: The… 2021 Action 7.2
31 nm1684869 Troy Baker 48 Batman: The… 2021 Action 7.2
32 nm0071680 Clé Bennett NA Four Souls … 2023 Action 7.2
33 nm2642131 Zach Callis… 27 Batman: The… 2021 Action 7.2
34 nm0235960 Robin Atkin… 48 Mortal Komb… 2021 Action 6.5
35 nm2022559 Dave B. Mit… 55 Mortal Komb… 2021 Action 6.5
36 nm0862916 Tommy Tiern… 55 Wolfwalkers 2020 Action 8
37 nm0000408 Jonathan Fr… 72 Catwoman: H… 2022 Action 5.6
38 nm4464492 Dakota Daul… NA The Last Vi… 2021 Action 5
39 nm2941559 Ike Amadi NA Mortal Komb… 2021 Action 6.5
40 nm0000439 Neil Patric… 51 The Matrix … 2021 Action 5.6
41 nm0146097 James Carte… 70 Pokémon the… 2020 Action 6.4
42 nm6299248 Bowen Yang 34 The Tiger's… 2024 Action 5.7
43 nm0570364 Joel McHale 53 Mortal Komb… 2021 Action 6.5
44 nm0022771 Steve Alter… 61 Trollhunter… 2021 Action 6.6
45 nm0273159 Alex Ferns 56 Godzilla x … 2024 Action 6.1
46 nm1189034 Bharat Dabh… 55 Ek Villain … 2022 Action 4.4
47 nm1167985 Lewis Tan NA Mortal Komb… 2021 Action 6
48 nm1282966 O-T Fagbenle 43 Black Widow 2021 Action 6.6
49 nm0152059 Vic Chao NA Trollhunter… 2021 Action 6.6
50 nm0389491 Bosco Hogan 75 The Legion 2020 Action 3.2
# ℹ 12 more rows
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgActorRating <dbl>
# Jacob Batalon and John DiMaggio
MOST_POPULAR_ACTORS |>
filter(primaryName %in% c("Jacob Batalon","John DiMaggio"))# A tibble: 2 × 10
nconst primaryName age primaryTitle year mainGenre avgMainMovieRating
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 nm8188622 Jacob Batalon 28 Spider-Man: … 2021 Action 8.2
2 nm0224007 John DiMaggio 56 Batman: The … 2021 Action 7.2
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgActorRating <dbl>
Task 6: Remaking a classic
FILTERED_MOVIES |>
filter(year <= 1998, grepl("action",genres,ignore.case=TRUE)) |>
arrange(desc(numVotes)) |>
unique() |>
print(n=30)# A tibble: 2,120 × 9
# Groups: tconst [2,120]
tconst title year metric avgRating numVotes decade castCount genres
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <chr>
1 tt0076759 Star Wars:… 1977 0.8 8.6 1474565 1970 24 Action
2 tt0080684 Star Wars:… 1980 0.8 8.7 1405682 1980 24 Action
3 tt0110413 Léon: The … 1994 1 8.5 1269895 1990 17 Actio…
4 tt0103064 Terminator… 1991 0.8 8.6 1198273 1990 22 Action
5 tt0086190 Star Wars:… 1983 0.8 8.3 1140230 1980 23 Action
6 tt0107290 Jurassic P… 1993 0.8 8.2 1089588 1990 21 Action
7 tt0082971 Raiders of… 1981 0.8 8.4 1051637 1980 23 Action
8 tt0095016 Die Hard 1988 0.8 8.2 960964 1980 22 Actio…
9 tt0088247 The Termin… 1984 0.8 8.1 942692 1980 19 Action
10 tt0083658 Blade Runn… 1982 1 8.1 836437 1980 22 Actio…
11 tt0097576 Indiana Jo… 1989 0.8 8.2 822476 1980 24 Action
12 tt0090605 Aliens 1986 0.8 8.4 787505 1980 20 Action
13 tt0113277 Heat 1995 0.6 8.3 733993 1990 22 Actio…
14 tt0116629 Independen… 1996 0.8 7 616858 1990 20 Action
15 tt0087469 Indiana Jo… 1984 0.8 7.5 542540 1980 23 Action
16 tt0119116 The Fifth … 1997 0.8 7.6 515969 1990 19 Action
17 tt0092099 Top Gun 1986 1 6.9 513973 1980 22 Actio…
18 tt0117060 Mission: I… 1996 0.8 7.2 478193 1990 22 Actio…
19 tt0119567 The Lost W… 1997 0.8 6.6 450702 1990 21 Action
20 tt0119698 Princess M… 1997 0.8 8.3 444550 1990 29 Action
21 tt0112864 Die Hard w… 1995 0.8 7.6 413230 1990 20 Actio…
22 tt0096895 Batman 1989 0.8 7.5 411928 1980 23 Action
23 tt0119094 Face/Off 1997 0.8 7.3 407477 1990 23 Action
24 tt0111257 Speed 1994 0.8 7.3 399472 1990 19 Actio…
25 tt0099423 Die Hard 2 1990 0.8 7.1 387990 1990 24 Actio…
26 tt0117500 The Rock 1996 0.8 7.4 363620 1990 23 Actio…
27 tt0100802 Total Reca… 1990 0.8 7.5 360446 1990 25 Action
28 tt0103776 Batman Ret… 1992 0.8 7.1 333509 1990 24 Action
29 tt0120201 Starship T… 1997 0.8 7.3 328003 1990 22 Action
30 tt0106977 The Fugiti… 1993 1 7.8 325363 1990 24 Actio…
# ℹ 2,090 more rows
# Independence Day seems like a good choice, let's make sure there's only one of it though
FILTERED_MOVIES |>
filter(title == "Independence Day")# A tibble: 2 × 9
# Groups: tconst [2]
tconst title year metric avgRating numVotes decade castCount genres
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <chr>
1 tt0085724 Independenc… 1983 1 6.5 585 1980 21 Drama
2 tt0116629 Independenc… 1996 0.8 7 616858 1990 20 Action
Looks like we’re good to go with this remake using this cast and crew!